Setting the options for knitr…
library(knitr)
# Global knitr chunk options for this document.
# BUG FIX: the original call passed `cache` twice (FALSE, then TRUE further
# down); the duplicate has been removed and caching is enabled once, which
# matches the cache.path/autodep settings below.
knitr::opts_chunk$set(results = 'asis',
                      echo = TRUE,           # show the code in the output
                      comment = NA,          # no prefix before printed results
                      prompt = FALSE,
                      warning = FALSE,       # hide warnings in the report
                      message = FALSE,       # hide package start-up messages
                      fig.align = "center",
                      fig.width = 8.125,
                      out.width = "100%",
                      fig.path = "Figures/figures_pdfa/",
                      dev = c('png', 'tiff'),
                      dev.args = list(png = list(type = "cairo"),
                                      tiff = list(compression = 'lzw')),
                      cache = TRUE,          # cache chunk results between knits
                      cache.path = "D:/cache/cache_pdfa/",
                      autodep = TRUE)        # infer chunk dependencies automatically
options(width = 1000, scipen = 999, knitr.kable.NA = '')
Loading libraries…
library(dplyr)
library(purrr)
library(forcats)
library(MASS)
library(ggplot2)
library(caret)
library(kableExtra)
library(pROC)
library(ggrepel)
library(GGally)
library(splitstackshape)
library(fmsb)
Functions to reorder a factor for proper display (Source: https://github.com/dgrtwo/drlib/blob/master/R/reorder_within.R)
# Reorder a factor within each facet group: every value is tagged with its
# group label (joined by `sep`) and the combined labels are reordered by `by`.
# Companion of scale_x_reordered(), which strips the tag for display.
reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  tagged <- paste(x, within, sep = sep)
  stats::reorder(tagged, by, FUN = fun)
}
# Discrete x scale that removes the "___group" suffix added by
# reorder_within() so that the axis labels show the original values.
scale_x_reordered <- function(..., sep = "___") {
  suffix_pattern <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(lbl) gsub(suffix_pattern, "", lbl), ...)
}
Function to compute VIF and assess multicolinearity… (Source: https://beckmw.wordpress.com/2013/02/05/collinearity-and-stepwise-vif-selection/)
# Stepwise VIF-based predictor selection (after Marcus W. Beck).
# Each variable is regressed on all the others and its variance inflation
# factor is computed with VIF() (fmsb); the variable with the largest VIF is
# dropped and the procedure repeats until every remaining VIF is below
# `thresh`. Returns the names of the retained variables.
#   in_frame : data.frame (or coercible) of candidate predictors
#   thresh   : VIF threshold above which variables are removed
#   trace    : print the VIF table at each iteration?
#   ...      : forwarded to lm()
vif_func <- function(in_frame, thresh = 10, trace = TRUE, ...){
  if(!inherits(in_frame, 'data.frame')) in_frame <- data.frame(in_frame)
  # Initial VIF values for every variable regressed on all the others
  vif_init <- NULL
  var_names <- names(in_frame)
  for(val in var_names){
    regressors <- var_names[-which(var_names == val)]
    form <- paste(regressors, collapse = '+')
    form_in <- formula(paste(val, '~', form))
    vif_init <- rbind(vif_init, c(val, VIF(lm(form_in, data = in_frame, ...))))
  }
  # Largest VIF observed on the full predictor set
  vif_max <- max(as.numeric(vif_init[,2]), na.rm = TRUE)
  if(vif_max < thresh){
    # Nothing exceeds the threshold: optionally report, keep all variables
    if(trace) {
      prmatrix(vif_init, collab = c('var','vif'), rowlab = rep('', nrow(vif_init)), quote = FALSE)
      cat('\n')
      cat(paste('All variables have VIF < ', thresh, ', max VIF ', round(vif_max, 2), sep = ''), '\n\n')
    }
    return(var_names)
  }
  else {
    in_dat <- in_frame
    # Backward elimination: drop the worst variable until all VIFs < thresh
    while(vif_max >= thresh){
      vif_vals <- NULL
      var_names <- names(in_dat)
      for(val in var_names){
        regressors <- var_names[-which(var_names == val)]
        form <- paste(regressors, collapse = '+')
        form_in <- formula(paste(val, '~', form))
        vif_add <- VIF(lm(form_in, data = in_dat, ...))
        vif_vals <- rbind(vif_vals, c(val, vif_add))
      }
      # Row holding the current maximum VIF (first one on ties)
      max_row <- which(vif_vals[,2] == max(as.numeric(vif_vals[,2]), na.rm = TRUE))[1]
      vif_max <- as.numeric(vif_vals[max_row,2])
      # Stop as soon as no VIF exceeds the threshold
      if(vif_max < thresh) break
      if(trace) {
        prmatrix(vif_vals, collab = c('var','vif'), rowlab = rep('', nrow(vif_vals)), quote = FALSE)
        cat('\n')
        cat('removed: ', vif_vals[max_row,1], vif_max, '\n\n')
        flush.console()
      }
      # drop = FALSE keeps a data.frame even when a single column remains
      # (the original indexing silently collapsed it to a vector, breaking
      # the next names() call)
      in_dat <- in_dat[, !names(in_dat) %in% vif_vals[max_row,1], drop = FALSE]
    }
    return(names(in_dat))
  }
}
Function to return a ggplot figure displaying a confusion matrix…
# Build a ggplot heat-map of a caret confusion matrix `m` (as returned by
# caret::confusionMatrix), annotated with the aggregate performance metrics.
# NOTE(review): relies on the globals `resultats`, `DV` and `set_name` being
# set by the surrounding analysis loop — confirm before reusing elsewhere.
ggplotConfusionMatrix <- function(m){
  caption <- paste(
    "Log Loss", round(resultats[[DV]][[set_name]]$Log.Loss, 4),
    "Kappa:", round(resultats[[DV]][[set_name]]$Kappa, 4),
    "Multi-class AUC:", round(resultats[[DV]][[set_name]]$AUC, 4),
    "Acc.:", round(resultats[[DV]][[set_name]]$Acc, 4),
    "Bal. Acc.:", round(resultats[[DV]][[set_name]]$Bal.Acc, 4),
    sep = " - ")
  # (The original also computed per-class totals and fractions here, but
  # never used them in the plot; that dead code has been removed.)
  p <- m$table %>%
    data.frame() %>%
    ggplot(aes(predicted, observed)) +
    geom_tile(aes(fill = Freq), colour = "white", show.legend = FALSE) +
    scale_fill_distiller(palette = "YlOrBr", direction = 1) +
    geom_text(aes(label = Freq), vjust = 1, size = 3) +
    labs(title = "Confusion Matrix (100 iterations - Testing set)",
         subtitle = paste(DV, set_name, sep = " - "),
         caption = caption
    ) +
    # BUG FIX: a complete theme such as theme_classic() replaces any earlier
    # theme() tweaks, so the original's theme(legend.position = "none") was
    # silently discarded. Apply the complete theme first, then the tweak.
    theme_classic() +
    theme(legend.position = "none")
  return(p)
}
We load data tables and definitions of features sets prepared during the curation phase…
load(file = "Study_objects.RData")
Defining the predicted variables (individual, type…)…
DVs <- c("individual", "type")
We have previously defined three feature sets: Bioacoustics, DCT and MFCC. For the pDFA, we are going to keep only the first two:
The MFCC predictor set is not used with pDFAs because of its high dimensionality and high collinearity that inadequately meet the assumptions of discriminant analysis (the size of the smallest class must be larger than the number of predictors)
set_names <- c("Bioacoustic", "DCT")
We create a (named) list of these sets…
SETs <- list(Bioacoustic_set, DCT_set)
The discriminant analysis does not automatically handle multicollinearity between predictors, which should be explicitly addressed.
A way to identify collinearity is to compute a variance inflation factor (VIF) for each explanatory variable. If we regress one explanatory variable by other explanatory variables, we obtain the coefficient of determination \(R^2\). VIF is defined by the following equation: \(VIF=1/(1-R^2)\).
If no explanatory variables are collinear, the VIF values will be all around 1. If the VIF is greater than 1, the predictors may be moderately collinear. A VIF between 5 and 10 indicates high collinearities that may be problematic. And if the VIF goes above 10, you could assume that the regression coefficients are poorly estimated due to multicollinearity.
“Montgomery & Peck (1992) used a value of 10, but a more stringent approach is to use values as low as 3 as we did here. High, or even moderate, collinearity is especially problematic when ecological signals are weak. In that case, even a VIF of 2 may cause non-significant parameter estimates, compared to the situation without collinearity.” — Zuur et al. (2010: 9)
As suggested by Zuur et al. (2010: 9), we use a stepwise strategy. The VIF values for all explanatory variables are computed first. If at least one of the VIF value is larger than a preselected threshold (here, 5), we drop the explanatory variable with the largest VIF. We then recalculate the VIFs and repeat the process until all VIFs are below the preselected threshold.
This custom stepwise VIF function has been built by Marcus W. Beck.
We reduce our Bioacoustic and DCT sets
# Reduce the Bioacoustic set: keep only the predictors whose stepwise VIF
# stays below 5 (see vif_func above)
df.set2 <- df %>%
dplyr::select(all_of(SETs[[1]]))
Bioacoustic_set <- vif_func(in_frame = df.set2, thresh = 5, trace = F)
# Reduce the DCT set in the same way
df.set2 <- df %>%
dplyr::select(all_of(SETs[[2]]))
DCT_set <- vif_func(in_frame = df.set2, thresh = 5, trace = F)We update our SETs object with the reduced Bioacoustic and DCT feature sets
SETs <- list(Bioacoustic_set, DCT_set)
knitr::combine_words(Bioacoustic_set)
duration, vocalization.HNR, q1f, q3f, f.max, q1t, q3t, t.max, f0.end, f0.slope.asc, and f0.slope.desc
knitr::combine_words(DCT_set)
duration, vocalization.HNR, dct0, dct1, dct2, dct3, and dct4
In order to analyze the information conveyed by the individual signature as well as the call types, we implemented a permuted discriminant analysis (pDFA) inspired by Mundry and Sommer (2007) and Keenan et al. (2020), which directly aimed to control the aforementioned imbalance in our data.
“In the first step of the DFA, a training data set was used to generate a set of linear discriminant functions. The training data set consisted of randomly selected sounds from each individual. The number of sounds selected per individual was the same for all individuals and equal to two-thirds of the smallest number of sounds that we obtained for an animal in our data set. In the second step, the discriminant functions generated from the training data set were used to classify the remaining sounds. For each individual, at least one-third of the sound provided by each individual was thus included in the validating data set. This cross-validation step gives a measure of the effect size (the percentage of correctly classified sounds, which has to be compared with chance, here 20%, i.e. 1/5 possible call types). We ran 100 iterations of these two-step DFAs with both training and validation data sets chosen at random. The mean effect size (mean percentage of correctly classified sounds) was obtained by calculating the average of the percentages of correctly classified sounds obtained with each of the 100 validation data sets.” — Keenan et al. (2020)
In the first step of this permuted form of discriminant analysis, a training sample was used to compute a set of linear discriminant functions. In order to test the call type distinctiveness, the training sample was randomly selected from each call type without controlling for the individual. Conversely, in order to test the individual vocal signature, the training sample was randomly selected from each individual’s vocalizations. The number of occurrences selected was the same for each call type and each individual. This number was equal to two thirds of the smallest number of vocalizations analyzed per call type (i.e., 206×2/3) and per individual (i.e., 71×2/3). These discriminant analysis models computed from training samples were used to classify all the remaining occurrences. Thus, for each individual and for each type of vocalization, at least one third of the occurrences were included in the test sample. We performed 100 iterations of these discriminant analyses with randomly selected training and test samples. The performance metrics and the percentage of correct classification were obtained by averaging the results obtained for each of the 100 test samples.
# Main pDFA loop: for each dependent variable (individual, type) and each
# feature set (Bioacoustic, DCT), run n.sel iterations of a two-step DFA
# (stratified train/test split, lda fit, evaluation on the held-out set)
# and accumulate per-iteration metrics, figures and confusion matrices.
n.sel <- 100 # Number of iterations
# We initialize a number of objects to store data
CONTROL <- NULL
graphs <- list()
graphs.ld <- list()
graphs.classif <- list()
resultats <- list()
plot.confusionmatrix <- list()
confusion.matrix <- list()
class2 <- list()
probabilities <- list()
probabilities2 <- list()
# We iterate over our different DV
for (DV in DVs) {
# We define the CONTROL variable (the factor NOT under study, used to check
# that classification is not driven by the other grouping)
if (DV == "type") {
CONTROL <- "individual"
}
if (DV == "individual") {
CONTROL <- "type"
}
# Number of levels of the dependent variable
ntf.levels <- length(unique(df[[DV]]))
# We compute 2/3 of the smallest number of observations we can find in one of the levels of the DV
n.to.sel <- df %>%
group_by(DV = get(DV)) %>%
summarise(n = n() ) %>%
slice_min(n, with_ties = FALSE) %>%
pull() %>%
{round((2/3)*., digits=0)}
# We iterate over our different feature sets
for (i in 1:length(SETs)) {
# We initialize objects (reset for each DV x feature-set combination)
class <- list()
confusion <- list()
ld <- list()
prop.ld <- list()
classif <- list()
# We collapse the names of the variable in order to build a formula for our model
SET2 <- paste(SETs[[i]], collapse = "+")
# We build object for the set under study
SET <- c(SETs[[i]], DV)
set_name <- set_names[i]
# We create a subset of our main data table which contains only the explanatory variable of the set under study, the DV and the control variable
df.set <- df %>%
dplyr::select(all_of(SET), all_of(CONTROL)) %>%
tibble::rownames_to_column(., var = "rowname")
for (k in 1:n.sel) {
set.seed(k) # Set the random seed for reproducibility
# We make a random partition of df.set so that we pick n.to.sel elements in each level of the factor DV to assemble a training set
# The test set is made of the remaining elements
samples <- stratified(df.set, DV, size = n.to.sel, bothSets = TRUE)
# Training set
training <- samples$SAMP1
training.class <- training %>% pull(all_of(DV))
training.data <- training %>% dplyr::select(all_of(SETs[[i]]))
# Class proportions of the training set, used as lda priors
training.prior <- training %>%
count(get(DV)) %>%
mutate(prop = prop.table(n)) %>%
pull() %>%
as.numeric()
# Test set
testing <- samples$SAMP2
testing.class <- testing %>% pull(all_of(DV))
testing.data <- testing %>% dplyr::select(all_of(SETs[[i]]))
testing.prior <- testing %>%
count(get(DV)) %>%
mutate(prop = prop.table(n)) %>%
pull() %>%
as.numeric()
# we define the DFA model with the training set
# NOTE(review): the formula is assembled as a string and run through
# eval(parse(...)); reformulate(SET2, DV) would avoid string evaluation
model <- paste("lda(", DV, "~", SET2,", prior=training.prior, data=training)", sep="")
# we compute the model and store the outputs in 'res'
res <- eval(parse(text=model))
training.pred <- predict(res)
training$pred <- training.pred$class
# We save the proportion of variance explained by each discriminant function
prop.ld[[k]] <- res$svd^2/sum(res$svd^2)
# We save the LD coefficients
ld[[k]] <- training.pred$x
# We apply the model to the test set
testing.pred <- predict(res, testing.data, prior=testing.prior)
testing$pred <- testing.pred$class
# We extract prediction probabilities
probabilities[[k]] <- as.data.frame(testing.pred$posterior) %>%
bind_cols(control=testing[[CONTROL]],
observed=testing[[DV]],
rowname=as.numeric(testing$rowname))
# We compute the negative of the multinomial log-likelihood (smaller is better)
mnlogloss <- testing %>%
mutate(observed=as.factor(get(DV))) %>%
dplyr::select(observed) %>%
bind_cols(testing.pred$posterior) %>%
mnLogLoss(., lev = levels(.$observed)) %>% as.numeric()
# We create the confusion matrix for the current iteration
confusion[[k]] <- table(list(predicted=testing$pred, observed=testing[[DV]]))
# We extract the values of a few metrics
acc <- t(confusionMatrix(confusion[[k]])$overall[[1]])
kappa <- t(confusionMatrix(confusion[[k]])$overall[[2]])
auc <- multiclass.roc(response = testing[[DV]], predictor=testing.pred$posterior)$auc[1]
# The balanced accuracy in binary and multiclass classification problems is another option to address
# imbalanced datasets. It is defined as the average of recall obtained on each class.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html
bal <- mean(confusionMatrix(confusion[[k]])[["byClass"]][,1])
# We get the total number of correct classifications in the training set for the current iteration
training.classif <- training %>%
mutate(sum = sum(pred == get(DV))) %>%
distinct(sum) %>%
pull() %>%
as.numeric()
# We compute the percentage of correct classification in the training set for the current iteration
training.pourc.classif <- training.classif / nrow(training)
# We get the total number of correct classifications in the test set for the current iteration
testing.classif <- testing %>%
mutate(sum = sum(pred == get(DV))) %>%
distinct(sum) %>%
pull() %>%
as.numeric()
# We compute the percentage of correct classification in the test set for the current iteration
testing.pourc.classif <- testing.classif/(nrow(testing))
# We create and store a data frame for the current iteration
# We save the number of correct classifications and percentage of correct classification for both training and test sets
# and also the different metrics for the test set
class[[k]] <- data.frame(DV, k,
n.training=nrow(training), training.classif, training.pourc.classif,
n.testing=nrow(testing), testing.classif, testing.pourc.classif,
mnlogloss, kappa, auc, acc, bal)
# We compute the rates of correct classification by level of the DV and by level of the CONTROL variable
classif[[k]] <- testing %>%
group_by(variable=testing[[DV]], control=testing[[CONTROL]]) %>%
summarise(
total = length(variable),
rate = sum(variable == pred) / total,
.groups = 'drop')
}
# We store the results for the DV and feature set under study
# 1. We bind the prediction probabilities for all the iterations
probabilities2[[DV]][[set_name]] <- do.call(rbind, probabilities)
# 2. We compute the average rates of correct classification for each level of the DV and each level of the CONTROL variable
classif2 <- do.call(rbind, classif) %>%
group_by(variable, control) %>%
summarize(rate = sum(rate) / n.sel, .groups = 'drop')
if (DV == "type")
classif2$variable <- factor(classif2$variable, levels = c("SCB", "B", "SB", "PY", "P"))
# We store a ggplot figure displaying the classification performance for the levels of the DV and those of the CONTROL variable
graphs.classif[[DV]][[set_name]] <- ggplot() +
geom_point(data = classif2, aes(x = variable, y = rate, color = variable), alpha = 0.65) +
geom_text_repel(data = classif2, aes(x = variable, y = rate, color = variable, label = control), min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = Inf, alpha=0.65) +
geom_point(data = classif2 %>% group_by(variable) %>% summarise(rate = mean(rate)), aes(variable, rate, fill = variable), colour="black", pch=21, size = 4, alpha=0.95) +
labs(x = "", y = "% Correct Classification", title = paste0("Testing dataset - ", DV, " - ", set_name)) +
theme_classic()
# We sum the values of the LDs for all the iterations and divide them by the number of iterations (n.sel)
ld2 <- Reduce('+', ld)/n.sel
# NOTE(review): `training` here is the one from the LAST iteration; the
# averaging over ld assumes row order is identical across iterations
ld2 <- data.frame(ld2, variable=training[[DV]])
# We sum the values of the variances of LDs for all the iterations and divide them by the number of iterations (n.sel)
prop.ld2 <- Reduce('+', prop.ld)/n.sel
# We store a ggplot figure displaying the LDs
graphs.ld[[DV]][[set_name]] <- ggpairs(ld2,
columns = 1:(length(ld2)-1),
aes(color = variable),
upper = "blank", legend=1,
lower = list(continuous = "points"),
diag = list(continuous = "densityDiag"),
axisLabels = "show") +
theme_bw() +
theme(legend.position = "bottom",
legend.title = element_blank()) +
labs(caption = paste0("Mean LDs: ", paste(round(prop.ld2*100, 2), collapse= "% "), "%"),
title = paste0("Training dataset - ", DV, " - ", set_name))
# We bind the classification data for all the iterations
class2[[DV]][[set_name]] <- do.call("rbind", class)
# We store a data table containing the various assessments of classification performance
resultats[[DV]][[set_name]] <- data.frame(
"DV"=DV,
"N.Iter"=n.sel,
"N.Train"=round(mean(class2[[DV]][[set_name]]$n.training), 0),
"N.Correct.Train"=round(mean(class2[[DV]][[set_name]]$training.classif), 0),
"Perc.Correct.Train"=round(round(mean(class2[[DV]][[set_name]]$training.classif), 0)/round(mean(class2[[DV]][[set_name]]$n.training), 0)*100, 0),
"N.Test"=round(mean(class2[[DV]][[set_name]]$n.testing),0),
"N.Correct.Test"=round(mean(class2[[DV]][[set_name]]$testing.classif), 0),
"Perc.Correct.Test"=round(round(round(mean(class2[[DV]][[set_name]]$testing.classif), 0)/round(mean(class2[[DV]][[set_name]]$n.testing),0)*100, 0), 0),
"Log.Loss"=round(mean(class2[[DV]][[set_name]]$mnlogloss), 3),
"sd.Log.Loss"=round(sd(class2[[DV]][[set_name]]$mnlogloss), 3),
"Kappa"=round(mean(class2[[DV]][[set_name]]$kappa), 3),
"sd.Kappa"=round(sd(class2[[DV]][[set_name]]$kappa), 3),
"AUC"=round(mean(class2[[DV]][[set_name]]$auc), 3),
"sd.AUC"=round(sd(class2[[DV]][[set_name]]$auc), 3),
"Acc"=round(mean(class2[[DV]][[set_name]]$acc), 3),
"sd.Acc"=round(sd(class2[[DV]][[set_name]]$acc), 3),
"Bal.Acc"=round(mean(class2[[DV]][[set_name]]$bal), 3),
"sd.Bal.Acc"=round(sd(class2[[DV]][[set_name]]$bal), 3)
)
# We store a ggplot figure displaying the percentages of correct classification for the training set
graphs[[DV]][[set_name]][[1]] <- ggplot(data=class2[[DV]][[set_name]]) +
geom_point(aes(k, 100 * training.pourc.classif ), color = "darkorange3") +
geom_smooth(aes(k, 100 * training.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
geom_hline(yintercept = 100/ntf.levels, linetype = "dashed", color = "red") +
annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats[[DV]][[set_name]]$Perc.Correct.Train,"%"), hjust = 0) +
labs(x = "Iteration", y = "Percent of correct classification", title = paste0("Training dataset - ", DV, " - ", set_name)) +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
theme_classic()
# Per-class chance levels, computed from the LAST iteration's test set
# NOTE(review): class proportions vary slightly between iterations — confirm
# the last iteration is representative enough for these reference lines
seuils.ind <- testing %>%
group_by(get(DV)) %>%
summarise(n= (n()) ) %>%
mutate( prop = 100*(n / sum(n)) )
# We store a ggplot figure displaying the percentages of correct classification for the test set
graphs[[DV]][[set_name]][[2]] <- ggplot(data=class2[[DV]][[set_name]]) +
geom_point(aes(k, 100*testing.pourc.classif ), color="dodgerblue4") +
geom_smooth(aes(k, 100*testing.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats[[DV]][[set_name]]$Perc.Correct.Test,"%"), hjust = 0) +
geom_hline(yintercept = seuils.ind$prop, linetype="dashed", color = "red") +
geom_text(data=seuils.ind, aes(0, prop, label = `get(DV)`), hjust = 0) +
labs(x = "Iteration",
y = "Percent of correct classification",
title = paste0("Testing dataset - ", DV, " - ", set_name)) +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
theme_classic()
# We sum cell by cell the content of all the confusion matrices (one per iteration)
# and we divide the values of the cells of the resulting matrix by the number of iterations (n.sel) to compute average values
confusion2 <- Reduce('+', confusion)/n.sel
confusion.matrix[[DV]][[set_name]] <- t(confusionMatrix(confusion2)$byClass)
# We create a figure for the average confusion matrix
plot.confusionmatrix[[DV]][[set_name]] <- ggplotConfusionMatrix(confusionMatrix(confusion2))
}
}
# We save the per-iteration classification data and the summary metric tables
saveRDS(class2, file = "Result files/pdfa-permuted-class.rds")
saveRDS(resultats, file = "Result files/pdfa-permuted-resultats01.rds")
saveRDS(probabilities2, file = "Result files/pdfa-permuted-probabilities.rds")
graphs.ld %>%
walk2(names(.), ~ print(.x ))$Bioacoustic
$DCT
$Bioacoustic
$DCT
graphs %>%
walk2(names(.), ~ print(.x ))$Bioacoustic $Bioacoustic[[1]]
$Bioacoustic[[2]]
$DCT $DCT[[1]]
$DCT[[2]]
$Bioacoustic $Bioacoustic[[1]]
$Bioacoustic[[2]]
$DCT $DCT[[1]]
$DCT[[2]]
resultats %>%
dplyr::bind_rows() %>%
walk2(names(.), ~ print(kable(.x, caption = paste0("Sets of predictors: ", .y),
digits=5,
col.names= c("DV", "N Iter.", "N", "N Correct Classif.", "Perc", "N", "N Correct Classif.", "Perc.", "Log Loss", "sd Log Loss", "Kappa", "sd Kappa", "Multi-class AUC", "sd AUC", "Acc.", "sd Acc.", "Bal. Acc.", "sd Bal. Acc.") ) %>%
kable_styling() %>%
add_header_above(c(" " = 2, "Original Training Set" =3, "Original Testing Test" = 3, "Evaluation Metrics" = 10)) ))| DV | N Iter. | N | N Correct Classif. | Perc | N | N Correct Classif. | Perc. | Log Loss | sd Log Loss | Kappa | sd Kappa | Multi-class AUC | sd AUC | Acc. | sd Acc. | Bal. Acc. | sd Bal. Acc. |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| individual | 100 | 470 | 172 | 37 | 1090 | 447 | 41 | 1.745 | 0.042 | 0.267 | 0.016 | 0.731 | 0.008 | 0.410 | 0.015 | 0.236 | 0.012 |
| type | 100 | 685 | 452 | 66 | 875 | 570 | 65 | 0.819 | 0.025 | 0.526 | 0.021 | 0.894 | 0.006 | 0.652 | 0.015 | 0.588 | 0.018 |
| DV | N Iter. | N | N Correct Classif. | Perc | N | N Correct Classif. | Perc. | Log Loss | sd Log Loss | Kappa | sd Kappa | Multi-class AUC | sd AUC | Acc. | sd Acc. | Bal. Acc. | sd Bal. Acc. |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| individual | 100 | 470 | 155 | 33 | 1090 | 468 | 43 | 1.698 | 0.056 | 0.288 | 0.016 | 0.731 | 0.009 | 0.430 | 0.016 | 0.235 | 0.010 |
| type | 100 | 685 | 464 | 68 | 875 | 590 | 67 | 0.813 | 0.020 | 0.554 | 0.018 | 0.901 | 0.006 | 0.675 | 0.013 | 0.596 | 0.017 |
plot.confusionmatrix %>%
walk2(names(.), ~ print(.x ))$Bioacoustic
$DCT
$Bioacoustic
$DCT
graphs.classif %>%
walk2(names(.), ~ print(.x ))$Bioacoustic
$DCT
$Bioacoustic
$DCT
“In addition to the cross-validated DFAs performed on original datasets, new data sets were also created, where the identity of sounds was randomly permuted between individuals (pDFA), to obtain the statistical significance of the mean effect size. From these randomized sets, the same steps, fitting and validation, were performed consecutively. After 1000 iterations, we calculated the proportion of randomized validation data sets revealing a number of correctly classified calls being at least as large as the effect size obtained with the non-randomized validation data set. This proportion gives the significance of the discrimination level and is equivalent to a P value (Dentressangle et al., 2012 ; Mathevon et al., 2010 ; Mundry & Sommer, 2007).” — Keenan et al. (2020)
In order to statistically compare the performance reached by discriminant analysis to the chance level, additional discriminant analyses were also performed on modified datasets created by recombination, following the permuted DFA paradigm. Recombination consists in assigning a class label (call type or individual identity, depending on the task considered) to an observation by a random permutation over the dataset. Training and test sets were then drawn following the same rules as for the original dataset, and a DFA was applied, leading to an accuracy characteristic of a random classification. This procedure was iterated 1,000 times, resulting in a robust estimation of the distribution of random accuracies. Empirical p-value of the performance reached with the original discriminant analysis was then obtained by dividing the number of randomized datasets that revealed a percentage of correctly classified calls or individuals at least as large as the one obtained for the original dataset by the total number of datasets tested (Mundry & Sommer, 2007 ; Keenan et al., 2020).
n.perm <- 1000 # Number of permutations
# Permutation-test counters: 1 + the number of randomized datasets performing
# at least as well as the original analysis (starting at 1 keeps the
# empirical p-value strictly positive, following Mundry & Sommer, 2007)
p.train <- 1
p.test <- 1
# Containers for the permutation summary tables, raw results and figures
resultats2 <- list()
perm.class2 <- list()
graphs.permutations <- list()
# Permutation loop: for each DV and feature set, repeat the two-step DFA on
# n.perm datasets whose DV labels have been randomly permuted, count how often
# the permuted performance reaches the observed one, and derive empirical
# p-values (Mundry & Sommer, 2007).
for (DV in DVs) {
ntf.levels <- length(unique(df[[DV]]))
# 2/3 of the smallest class size, as in the main analysis
n.to.sel <- df %>%
group_by(DV = get(DV)) %>%
summarise(n= n() ) %>%
slice_min(n, with_ties = FALSE) %>%
pull() %>%
{round((2/3)*.,digits=0)}
for (i in 1:length(SETs)) {
# BUG FIX: reset the p-value counters for THIS DV x feature-set combination.
# The original initialized them only once before the loops, so the counts of
# one combination leaked into the p-values computed for the following ones.
p.train <- 1
p.test <- 1
perm.class <- list()
perm.confusion <- list()
APPEL_MODEL <- paste(SETs[[i]], collapse = "+")
SET <- c(SETs[[i]], DV)
set_name <- set_names[i]
df.set.ini <- df %>% dplyr::select(all_of(SET))
for (k in 1:n.perm) {
set.seed(k)
# We shuffle the rows, then restore the original DV column: this permutes
# the DV labels relative to the predictors
df.set <- slice(df.set.ini, sample(1:n()))
df.set[[DV]] <- df.set.ini[[DV]]
echantillons <- stratified(df.set, DV, size = n.to.sel, bothSets = TRUE)
# Training set and lda priors (class proportions)
training <- echantillons$SAMP1
training.class <- training %>% pull(all_of(DV))
training.data <- training %>% dplyr::select(all_of(SETs[[i]]))
training.prior <- training %>%
count(get(DV)) %>%
mutate(prop = prop.table(n)) %>%
pull() %>%
as.numeric()
# Test set and its priors
testing <- echantillons$SAMP2
testing.class <- testing %>% pull(all_of(DV))
testing.data <- testing %>% dplyr::select(all_of(SETs[[i]]))
testing.prior <- testing %>%
count(get(DV)) %>%
mutate(prop = prop.table(n)) %>%
pull() %>%
as.numeric()
# Fit the DFA on the permuted training set, then classify the test set
model <- paste("lda(", DV, "~", APPEL_MODEL,", prior=training.prior, data=training)", sep="")
res <- eval(parse(text=model))
training.pred <- predict(res)
training$pred <- training.pred$class
testing.pred <- predict(res, testing.data, prior = testing.prior)
testing$pred <- testing.pred$class
# NOTE(review): this assignment is never used afterwards and clobbers the
# global `probabilities` list from the main analysis (already saved above)
probabilities <- as.data.frame(testing.pred$posterior)
perm.confusion[[k]] <- table(list(predicted=testing$pred, observed=testing[[DV]]))
# Number and percentage of correct classifications (training set)
perm.training.classif <- training %>%
mutate(sum = sum(pred==get(DV))) %>%
distinct(sum) %>%
pull() %>%
as.numeric()
perm.training.pourc.classif <- perm.training.classif / (nrow(training))
# Number and percentage of correct classifications (test set)
perm.testing.classif <- testing %>%
mutate(sum = sum(pred==get(DV))) %>%
distinct(sum) %>%
pull() %>%
as.numeric()
perm.testing.pourc.classif <- perm.testing.classif / (nrow(testing))
perm.class[[k]] <- data.frame(k, perm.n.training=nrow(training), perm.training.classif, perm.training.pourc.classif, perm.n.testing=nrow(testing), perm.testing.classif, perm.testing.pourc.classif)
# Count permuted datasets that perform at least as well as the original one
if (perm.training.classif >= resultats[[DV]][[set_name]]$N.Correct.Train)
p.train <- p.train + 1
if (perm.testing.classif >= resultats[[DV]][[set_name]]$N.Correct.Test)
p.test <- p.test + 1
}
perm.class2[[DV]][[set_name]] <- do.call("rbind", perm.class)
# Summary table with empirical p-values for this DV x feature-set combination
resultats2[[DV]][[set_name]] <- data.frame(
"DV"=DV,
"N.Iter"=n.perm,
"N.Train"=round(mean(perm.class2[[DV]][[set_name]]$perm.n.training), 0),
"N.Correct.Train"=round(mean(perm.class2[[DV]][[set_name]]$perm.training.classif), 0),
"Perc.Correct.Train"=round(round(mean(perm.class2[[DV]][[set_name]]$perm.training.classif), 0)/round(mean(perm.class2[[DV]][[set_name]]$perm.n.training), 0)*100, 0),
"P.Value.Train"=p.train/n.perm,
"N.Test"=round(mean(perm.class2[[DV]][[set_name]]$perm.n.testing),0),
"N.Correct.Test"=round(mean(perm.class2[[DV]][[set_name]]$perm.testing.classif), 0),
"Perc.Correct.Test"=round(round(mean(perm.class2[[DV]][[set_name]]$perm.testing.classif), 0)/round(mean(perm.class2[[DV]][[set_name]]$perm.n.testing),0)*100, 0),
"P.Value.Test"=p.test/n.perm
)
# Figure: percentage of correct classification per permutation (training set)
graphs.permutations[[DV]][[set_name]][[1]] <- ggplot(data=perm.class2[[DV]][[set_name]]) +
geom_point(aes(k, 100*perm.training.pourc.classif ), color="darkorange3") +
geom_smooth(aes(k, 100*perm.training.pourc.classif ), method = "loess", se = FALSE, color = "black") +
geom_hline(yintercept=100/ntf.levels, linetype="dashed", color = "red") +
annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats2[[DV]][[set_name]]$Perc.Correct.Train, "%"), hjust = 0) +
labs(x = "Iteration",
y = "Percent of correct classification",
title = paste0("Training permuted dataset - ", DV, " - ", set_name)) +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
theme_classic()
# Per-class chance levels from the last permutation's test set
seuils.ind <- testing %>%
group_by(get(DV)) %>%
summarise(n= (n()) ) %>%
mutate( prop = 100*(n / sum(n)) )
# Figure: percentage of correct classification per permutation (test set)
graphs.permutations[[DV]][[set_name]][[2]] <- ggplot(data=perm.class2[[DV]][[set_name]]) +
geom_point(aes(k, 100*perm.testing.pourc.classif ), color="dodgerblue4") +
geom_smooth(aes(k, 100*perm.testing.pourc.classif ), method = "loess", se = FALSE, color = "black") + # Loess regression
annotate("text", x = 0, y = 100, label = paste("Average correct classification: ", resultats2[[DV]][[set_name]]$Perc.Correct.Test, "%"), hjust = 0) +
geom_hline(yintercept = seuils.ind$prop, linetype="dashed", color = "red") +
geom_text(data=seuils.ind, aes(0, prop, label = `get(DV)`), hjust = 0) +
labs(x = "Iteration",
y = "Percent of correct classification",
title = paste0("Testing permuted dataset - ", DV, " - ", set_name)) +
scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 20)) +
theme_classic()
}
}
We save the results in .RDS files:
# We save the raw permutation classification results
saveRDS(perm.class2, file = "Result files/pdfa-permuted-perm.class.rds")
saveRDS(resultats2, file = "Result files/pdfa-permuted-resultats02.rds")
graphs.permutations %>%
walk2(names(.), ~ print(.x ))$Bioacoustic $Bioacoustic[[1]]
$Bioacoustic[[2]]
$DCT $DCT[[1]]
$DCT[[2]]
$Bioacoustic $Bioacoustic[[1]]
$Bioacoustic[[2]]
$DCT $DCT[[1]]
$DCT[[2]]
resultats2 %>%
dplyr::bind_rows()%>%
walk2(names(.), ~ print(kable(.x, caption = paste0("Sets of predictors: ", .y),
digits=5,
col.names= c("DV", "N Iter.", "N", "N Correct Classif.", "Perc.", "P value", "N", "N Correct Classif.", "Perc.", "P value") ) %>%
kable_styling() %>%
add_header_above(c(" " = 2, "Randomized Training Set" = 4, "Randomized Testing Test" = 4)) ))| DV | N Iter. | N | N Correct Classif. | Perc. | P value | N | N Correct Classif. | Perc. | P value |
|---|---|---|---|---|---|---|---|---|---|
| individual | 1000 | 470 | 89 | 19 | 0.001 | 1090 | 268 | 25 | 0.001 |
| type | 1000 | 685 | 185 | 27 | 0.001 | 875 | 287 | 33 | 0.001 |
| DV | N Iter. | N | N Correct Classif. | Perc. | P value | N | N Correct Classif. | Perc. | P value |
|---|---|---|---|---|---|---|---|---|---|
| individual | 1000 | 470 | 77 | 16 | 0.001 | 1090 | 285 | 26 | 0.001 |
| type | 1000 | 685 | 172 | 25 | 0.001 | 875 | 293 | 33 | 0.001 |
R version 4.1.3 (2022-03-10)
Platform: x86_64-w64-mingw32/x64
(64-bit)
OS version: Windows 10 x64 (build 22000)
stats: The R Stats Package version
4.1.3graphics: The R Graphics Package
version 4.1.3grDevices: The R Graphics Devices
and Support for Colours and Fonts version
4.1.3utils: The R Utils Package version
4.1.3datasets: The R Datasets Package
version 4.1.3methods: Formal Methods and
Classes version 4.1.3base: The R Base
Package version 4.1.3fmsb: Functions for
Medical Statistics Book with some Demographic Data version
0.7.3splitstackshape: Stack and Reshape Datasets
After Splitting Concatenated Values version
1.4.8GGally: Extension to ‘ggplot2’ version
2.1.2ggrepel: Automatically Position
Non-Overlapping Text Labels with ‘ggplot2’ version
0.9.1pROC: Display and Analyze ROC Curves
version 1.18.0kableExtra: Construct Complex Table
with ‘kable’ and Pipe Syntax version 1.3.4caret:
Classification and Regression Training version
6.0-90lattice: Trellis Graphics for R
version 0.20-45ggplot2: Create Elegant Data
Visualisations Using the Grammar of Graphics version
3.3.5MASS: Support Functions and Datasets for
Venables and Ripley’s MASS version
7.3-55forcats: Tools for Working with Categorical
Variables (Factors) version 0.5.1purrr:
Functional Programming Tools version
0.3.4dplyr: A Grammar of Data Manipulation
version 1.0.8knitr: A General-Purpose Package for
Dynamic Report Generation in R version 1.37
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎
Université de Lyon, CNRS, Laboratoire Dynamique Du Langage, UMR 5596, Lyon, France↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
Département des arts, des lettres et du langage, Université du Québec à Chicoutimi, Chicoutimi, Canada↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
ENES Bioacoustics Research Laboratory, University of Saint Étienne, CRNL, CNRS UMR 5292, INSERM UMR_S 1028, Saint-Étienne, France↩︎
Department of Linguistics, The University of Hong Kong, Hong Kong, China, Corresponding author↩︎